import numpy as np
import pandas as pd
import scipy as sp
from scipy.stats import mode
from sklearn import linear_model
import matplotlib
import matplotlib.pyplot as plt
from sklearn import discriminant_analysis
from sklearn.decomposition import PCA
from sklearn import preprocessing
%matplotlib inline
import datetime
import statsmodels.api as sm
from statsmodels.tsa.stattools import acf
from statsmodels.tsa.stattools import pacf
from statsmodels.tsa.seasonal import seasonal_decompose
import seaborn as sb
sb.set_style('darkgrid')
from scipy import stats
from statsmodels.graphics.api import qqplot
from sklearn.metrics import r2_score
from sklearn import model_selection
import warnings
warnings.filterwarnings('ignore')
daily_data = pd.read_csv('../exchangeratedata/daily_full.csv', skiprows=7, header=0)
monthly_data = pd.read_csv('../exchangeratedata/monthly_rates.csv', skiprows=11, header=0)
daily_data['datetime'] = pd.to_datetime(daily_data['DATE'])
daily_data['dayofweek'] = daily_data['datetime'].apply(lambda row: row.dayofweek)
weekly_data = daily_data[daily_data['dayofweek'] == 4]
weekly_data.dtypes
print weekly_data.head(5)
UK_US = pd.DataFrame()
UK_US['UK_US']=weekly_data['XUDLUSS']
UK_US['LIBOR']=weekly_data['IUDSOIA']
UK_US = UK_US.set_index(weekly_data['datetime'].values) # the index must carry the date-time information
UK_EU = pd.DataFrame()
UK_EU['UK_EU']=weekly_data['XUDLERS']
UK_EU['LIBOR']=weekly_data['IUDSOIA']
UK_EU = UK_EU.set_index(weekly_data['datetime'].values) # set_index returns a new DataFrame, so assign it back
# dates = pd.DatetimeIndex(UK_US.index)
# dates[:'2000-03-10']
ts = UK_US['UK_US']
libor = UK_US['LIBOR']
ts['2000-01-07']
libor['2000-01-07']
We can assume the series to be stationary if it has constant statistical properties over time, i.e. constant mean, constant variance and an autocovariance that does not depend on time.
Stationarity is checked below using plots of the rolling statistics and the Dickey-Fuller test.
plt.figure(figsize=(10, 5))
plt.plot(UK_US['UK_US'])
from statsmodels.tsa.stattools import adfuller
def test_stationarity(timeseries):
    # Determine rolling statistics
rolmean = pd.rolling_mean(timeseries, window=12)
rolstd = pd.rolling_std(timeseries, window=12)
#Plot rolling statistics:
orig = plt.plot(timeseries, color='blue',label='Original')
mean = plt.plot(rolmean, color='red', label='Rolling Mean')
std = plt.plot(rolstd, color='black', label = 'Rolling Std')
plt.legend(loc='best')
plt.title('Rolling Mean & Standard Deviation')
plt.show(block=False)
#Perform Dickey-Fuller test:
print 'Results of Dickey-Fuller Test:'
dftest = adfuller(timeseries, autolag='AIC')
dfoutput = pd.Series(dftest[0:4], index=['Test Statistic','p-value','#Lags Used','Number of Observations Used'])
for key,value in dftest[4].items():
dfoutput['Critical Value (%s)'%key] = value
print dfoutput
test_stationarity(UK_US['UK_US'])
test_stationarity(UK_EU['UK_EU'])
For both series the test statistic is greater than the critical values, so the series are not stationary. Note that the signed values should be compared, not the absolute values.
There are two major reasons behind the non-stationarity of a time series: trend and seasonality.
The underlying principle is to model or estimate the trend and seasonality in the series and remove them to get a stationary series. Statistical forecasting techniques can then be applied to this series, and the final step is to convert the forecasted values back to the original scale by adding the trend and seasonality components back.
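As a quick illustration of this idea, the seasonal_decompose helper imported above can split the log series into trend, seasonal and residual components. This is only a sketch: the choice of freq=52 (one seasonal cycle per year of weekly data) is an assumption.
# Sketch: decompose the log-transformed weekly series into trend, seasonality and residuals
decomposition = seasonal_decompose(np.log(UK_US['UK_US']).values, freq=52)
plt.figure(figsize=(10, 8))
plt.subplot(311); plt.plot(decomposition.trend); plt.title('Trend')
plt.subplot(312); plt.plot(decomposition.seasonal); plt.title('Seasonality')
plt.subplot(313); plt.plot(decomposition.resid); plt.title('Residuals')
plt.tight_layout()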
ts = UK_US['UK_US']
ts_log = np.log(UK_US['UK_US'])
plt.plot(ts_log)
moving_avg = pd.rolling_mean(ts_log,12)
plt.plot(ts_log)
plt.plot(moving_avg, color='red')
ts_log_moving_avg_diff = ts_log - moving_avg
ts_log_moving_avg_diff.dropna(inplace=True)
test_stationarity(ts_log_moving_avg_diff)
The test statistic is smaller than the 1% critical value, so we can say with 99% confidence that the series is stationary.
exp_weighted_avg = pd.ewma(ts_log, halflife=12)
plt.plot(ts_log)
plt.plot(exp_weighted_avg, color='red')
ts_log_ewma_diff = ts_log - exp_weighted_avg
test_stationarity(ts_log_ewma_diff)
This also results in a test statistic which is smaller than the 1% critical value
ts_log_diff = ts_log - ts_log.shift()
plt.plot(ts_log_diff)
ts_diff = UK_US['UK_US'] - UK_US['UK_US'].shift()
ts_log_diff.dropna(inplace=True)
test_stationarity(ts_log_diff)
The test statistic is much lower than the 1% critical value (roughly -16 compared to -3).
ARIMA forecasting for a stationary time series is essentially a linear equation (like a linear regression). The predictors depend on the parameters (p,d,q) of the ARIMA model: p is the number of autoregressive (AR) terms, d is the number of non-seasonal differences, and q is the number of moving-average (MA) terms.
An important concern here is how to determine the values of ‘p’ and ‘q’. We use two plots to determine these numbers.
from statsmodels.tsa.stattools import acf, pacf
lag_acf = acf(ts_log_diff, nlags=20)
lag_pacf = pacf(ts_log_diff, nlags=20, method='ols')
#Plot ACF:
plt.figure()
plt.subplot(121)
plt.plot(lag_acf)
plt.axhline(y=0,linestyle='--',color='gray')
plt.axhline(y=-1.96/np.sqrt(len(ts_log_diff)),linestyle='--',color='gray')
plt.axhline(y=1.96/np.sqrt(len(ts_log_diff)),linestyle='--',color='gray')
plt.title('Autocorrelation Function')
#Plot PACF:
plt.subplot(122)
plt.plot(lag_pacf)
plt.axhline(y=0,linestyle='--',color='gray')
plt.axhline(y=-1.96/np.sqrt(len(ts_log_diff)),linestyle='--',color='gray')
plt.axhline(y=1.96/np.sqrt(len(ts_log_diff)),linestyle='--',color='gray')
plt.title('Partial Autocorrelation Function')
plt.tight_layout()
The two dotted lines on either side of 0 are the confidence intervals. These can be used to determine the ‘p’ and ‘q’ values for the ARIMA model: p is roughly the lag at which the PACF first crosses the upper confidence interval, and q the lag at which the ACF first crosses it.
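As a rough sketch of reading those values off programmatically (a heuristic added here, not part of the original selection):
# Heuristic sketch: first lag at which the PACF (for p) and ACF (for q) drop inside the upper 95% bound
conf_bound = 1.96 / np.sqrt(len(ts_log_diff))
p_guess = next(lag for lag, v in enumerate(lag_pacf) if v < conf_bound)
q_guess = next(lag for lag, v in enumerate(lag_acf) if v < conf_bound)
print 'suggested p:', p_guess, 'suggested q:', q_guess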
There are three models we could try: an AR model, an MA model, and a combined ARIMA model, each fitted with and without LIBOR as an exogenous variable.
from statsmodels.tsa.arima_model import ARIMA
model = ARIMA(ts_log, order=(6, 0, 0))
results_AR = model.fit(disp=-1)
plt.plot(ts_log)
plt.plot(results_AR.fittedvalues, color='red')
plt.title('RSS: %.4f'% sum((results_AR.fittedvalues-ts_log)**2))
model = ARIMA(ts_log, order=(6, 0, 0), exog=libor)
results_AR = model.fit(disp=-1)
plt.plot(ts_log)
plt.plot(results_AR.fittedvalues, color='red')
plt.title('RSS: %.4f'% sum((results_AR.fittedvalues-ts_log)**2))
model = ARIMA(ts_log, order=(0,0, 2))
results_MA = model.fit(disp=-1)
plt.plot(ts_log)
plt.plot(results_MA.fittedvalues, color='red')
plt.title('RSS: %.4f'% sum((results_MA.fittedvalues-ts_log)**2))
model = ARIMA(ts_log, order=(0,0, 2), exog = libor)
results_MA = model.fit(disp=-1)
plt.plot(ts_log)
plt.plot(results_MA.fittedvalues, color='red')
plt.title('RSS: %.4f'% sum((results_MA.fittedvalues-ts_log)**2))
model = ARIMA(ts_log, order=(6, 0, 5))
results_ARIMA = model.fit(disp=-1)
plt.plot(ts_log)
plt.plot(results_ARIMA.fittedvalues, color='red')
plt.title('RSS: %.4f'% sum((results_ARIMA.fittedvalues-ts_log)**2))
model = ARIMA(ts_log, order=(6, 0, 4), exog=libor)
results_ARIMA = model.fit(disp=-1)
plt.plot(ts_log)
plt.plot(results_ARIMA.fittedvalues, color='red')
plt.title('RSS: %.4f'% sum((results_ARIMA.fittedvalues-ts_log)**2))
## adjusted R2
# # trend = sm.tsa.arma_order_select_ic(ts, ic=['aic', 'bic'])
# model = statsmodels.tsa.arima_model.ARMA(ts_diff, trend.aic_min_order)
# results_ARMA = model.fit(disp=-1)
# plt.plot(ts_diff)
# plt.plot(results_ARMA.fittedvalues, color='red')
# plt.title('RSS: %.4f'% sum((results_ARMA.fittedvalues-ts_diff)**2))
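Adjusted R2 is mentioned above but never computed; here is a minimal helper sketch in case it is wanted later (n observations and p model parameters are assumptions about how it would be called):
# Sketch: adjusted R2 = 1 - (1 - R2) * (n - 1) / (n - p - 1)
def adjusted_r2(y, yhat, p):
    n = len(y)
    r2 = r2_score(y, yhat)
    return 1 - (1 - r2) * (n - 1) / float(n - p - 1)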
def transform_back(results):
    # The models above are fit directly on the log series (d=0), so the fitted values
    # only need the log transform inverted to get back to the original scale.
    # If a model were fit on first differences instead, the commented steps below show
    # how to cumulatively sum the fitted differences and re-base on the first log value:
    # predictions_diff = pd.Series(results.fittedvalues, copy=True)
    # predictions_diff_cumsum = predictions_diff.cumsum()
    # predictions_base = pd.Series(ts_log.ix[0], index=ts_log.index)
    # predictions_log = predictions_base.add(predictions_diff_cumsum, fill_value=0)
    # return np.exp(predictions_log)
    return np.exp(results.fittedvalues)
plt.figure(figsize=(20,15))
plt.plot(UK_US['UK_US'])
# plt.plot(transform_back(results_ARMA), label='ARMA')
plt.plot(transform_back(results_ARIMA), label='ARIMA')
plt.plot(transform_back(results_MA), label='MA')
plt.plot(transform_back(results_AR), label='AR')
plt.legend()
# plt.title('RMSE: %.4f'% np.sqrt(sum((predictions_ARIMA_exp-UK_US['UK_US'])**2)/len(UK_US['UK_US'])))
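To put numbers on the comparison plotted above, a rough sketch (added here) of the RMSE of each back-transformed fit against the observed series:
# Sketch: RMSE of each back-transformed fit against the observed exchange rate
for name, res in [('AR', results_AR), ('MA', results_MA), ('ARIMA', results_ARIMA)]:
    fitted = transform_back(res)
    rmse = np.sqrt(np.mean((fitted - UK_US['UK_US'])**2))
    print name, 'RMSE: %.4f' % rmse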
def score(y, yhat):
    # root-mean-squared error (the plots below label this value RMSE)
    return np.sqrt(np.mean((yhat - y)**2))
def forecast_cumulatively(ts):
plt.figure(figsize=(50,100))
j=0
for year in range(2001, 2015):
        ts_train = ts[str(2000):str(year)+str('-06')] # all data from 2000 up to June of this year
ts_years = ts[str(2000):str(year)]
ts_forecast_period = ts[str(year)+str('-07'): str(year)+str('-12')]
model = ARIMA(ts_train, order=(6, 0, 0))
results_ARIMA = model.fit(disp=-1)
n_weeks = len(ts_forecast_period)
forecast = results_ARIMA.forecast(n_weeks)[0] # 6 month forecast
forecast_series = pd.DataFrame({'forecast':forecast})
forecast_series = forecast_series.set_index(ts_forecast_period.index)
j+=1
plt.subplot(8, 2, j)
plt.plot(ts_years, label='us/uk exchange rate')
plt.plot(results_ARIMA.fittedvalues, 'r--')
plt.plot(forecast_series['forecast'], 'r--', label='forecast')
plt.title('RMSE: %.4f'% score(ts_forecast_period, forecast), fontsize=30)
plt.xticks(fontsize=30)
plt.yticks(fontsize=30)
plt.legend(fontsize=30, loc='best')
# plt.title('RMSE', score(ts_forecast_period.values, forecast))
# lags_moving_avs = np.zeros((7, 7))
# for lag in range(0,7):
# for ma in range(0,7):
# model = ARIMA(ts_train, order=(lag, 0, ma))
# results_ARIMA = model.fit(disp=-1)
# forecast = results_ARIMA.forecast(18) # 6 month forecast
# plt.subplot(ts_years)
# plt.plot(forecast)
# lags_moving_avs[lag, ma] = score(forecast, ts_forecast_period)
# return lags_moving_avs
# forecast_cumulatively(ts)
def forecast_by_year(ts):
plt.figure(figsize=(50,100))
j=0
for year in range(2001, 2015):
        ts_train = ts[str(year)+str('-01'):str(year)+str('-09')] # first 9 months of the year
ts_forecast_period = ts[str(year)+str('-10'): str(year)+str('-12')]
model = ARIMA(ts_train, order=(5, 0, 0))
results_ARIMA = model.fit(disp=-1)
n_weeks = len(ts_forecast_period)
        forecast = results_ARIMA.forecast(n_weeks)[0] # forecast the last 3 months (Oct-Dec)
forecast_series = pd.DataFrame({'forecast':forecast})
forecast_series = forecast_series.set_index(ts_forecast_period.index)
j+=1
plt.subplot(8, 2, j)
plt.plot(ts[str(year)], label='us/uk exchange rate')
plt.plot(results_ARIMA.fittedvalues, 'g--', label='fitted values')
plt.plot(forecast_series['forecast'], 'r--', label='forecast')
plt.title('RMSE: %.4f'% score(ts_forecast_period, forecast), fontsize=30)
plt.xticks(fontsize=20)
plt.yticks(fontsize=30)
plt.legend(fontsize=30, loc='best')
forecast_by_year(ts)
def forecast_by_year_libor(ts):
plt.figure(figsize=(50,100))
j=0
for year in range(2001, 2015):
        ts_train = ts[str(year)+str('-01'):str(year)+str('-09')] # first 9 months of the year
        ts_forecast_period = ts[str(year)+str('-10'): str(year)+str('-12')]
        # slice the exogenous LIBOR series to the same training and forecast windows
        libor_train = libor[str(year)+str('-01'):str(year)+str('-09')]
        libor_forecast = libor[str(year)+str('-10'): str(year)+str('-12')]
        model = ARIMA(ts_train, order=(5, 0, 0), exog=libor_train)
        results_ARIMA = model.fit(disp=-1)
        n_weeks = len(ts_forecast_period)
        forecast = results_ARIMA.forecast(n_weeks, exog=libor_forecast)[0] # forecast the last 3 months (Oct-Dec)
forecast_series = pd.DataFrame({'forecast':forecast})
forecast_series = forecast_series.set_index(ts_forecast_period.index)
j+=1
plt.subplot(8, 2, j)
plt.plot(ts[str(year)], label='us/uk exchange rate')
plt.plot(results_ARIMA.fittedvalues, 'g--', label='fitted values')
plt.plot(forecast_series['forecast'], 'r--', label='forecast')
plt.title('RMSE: %.4f'% score(ts_forecast_period, forecast), fontsize=30)
plt.xticks(fontsize=20)
plt.yticks(fontsize=30)
plt.legend(fontsize=30, loc='best')
forecast_by_year_libor(ts)
The current model uses 3 lags and a first difference, with no moving-average terms.
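For reference, with three lags and a first difference the model being fit on the differenced series Δy_t = y_t − y_{t−1} is Δy_t = c + φ1·Δy_{t−1} + φ2·Δy_{t−2} + φ3·Δy_{t−3} + ε_t, where the φ terms are the autoregressive coefficients and ε_t is white noise.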
def forecast_variable_horizon(ts, horizon, lags=3):
# start predictions at 2006
ts_2006 = ts['2006':]
n_forecasts = len(ts_2006)
actual_values = np.append(ts_2006.values, [0])
# initialise empty array for forecast
forecasts = np.zeros((n_forecasts+1, horizon+1))
confidence_intervals = np.zeros((n_forecasts+1, 2, horizon+1))
# choose a 5 year rolling window for predictions so current week - 5 years
five_years = datetime.timedelta(weeks=260)
    # iterate over each week from 2006 onwards
for week_i, week in enumerate(ts_2006.index):
# take the past 5 years as the training set
ts_train = ts[week-five_years: week]
model = ARIMA(ts_train, order=(lags, 1, 0))
results_ARIMA = model.fit(disp=-1)
forecast = results_ARIMA.forecast(horizon) # forecast for the horizon
forecast_vals = forecast[0]
forecast_conf_intervals = forecast[2]
for f in range(1,horizon+1):
if week_i <= n_forecasts - (f):
forecasts[week_i+f, f] = forecast_vals[f-1]
confidence_intervals[week_i+f, :, f] = forecast_conf_intervals[f-1,:]
    # each forecast is stored against the week it predicts (one step ahead of when it was made),
    # so the first row of the forecast array stays zero - no prediction exists for the first week
newtime = np.datetime64(ts_2006.index[-1]+datetime.timedelta(weeks=horizon))
new_time_index = pd.DatetimeIndex(np.append(ts_2006.index, newtime))
forecast_df = pd.DataFrame({'actual': actual_values}, index=new_time_index)
for col in range(horizon):
forecast_df[col+1] = forecasts[:, col+1]
return forecast_df, confidence_intervals
forecast_variable_horizon_df, confidence_intervals = forecast_variable_horizon(ts, 2)
forecast_variable_horizon_df.head()
confidence_intervals_reindexed = confidence_intervals[1:]
confidence_intervals_reindexed[:,:,1]
horizon = 2
plt.figure(figsize=(50,80))
j=0
for fc in range(1, horizon+1):
j+=1
plt.subplot(3, 1, j)
plt.plot(forecast_variable_horizon_df['actual'][:-1], 'g', linewidth=2, label='actual values')
plt.plot(forecast_variable_horizon_df.index[fc:], forecast_variable_horizon_df[fc][fc:], 'r--', linewidth=2, label=str(fc) +' week forecast')
actual = forecast_variable_horizon_df['actual'][fc:]
# confidence bounds:
    lower = confidence_intervals[:,:,fc][fc:, 0]
    upper = confidence_intervals[:,:,fc][fc:, 1]
plt.plot(forecast_variable_horizon_df.index[fc:], lower, 'c--', label='lower confidence bound')
plt.plot(forecast_variable_horizon_df.index[fc:], upper, 'c--', label='upper confidence bound')
predicted = forecast_variable_horizon_df[fc][fc:]
rmse = score(actual, predicted)
r2 = r2_score(actual, predicted)
    plt.title('Forecast horizon ' + str(fc) + ' weeks: RMSE: %.4f '% rmse + ' R2: %.4f' % r2, fontsize=30)
plt.xticks(fontsize=30)
plt.yticks(fontsize=30)
plt.legend(fontsize=30, loc='best')
def forecast_variable_horizon_libor(ts, horizon, lags=3):
# start predictions at 2006
ts_2006 = ts['2006':]
n_forecasts = len(ts_2006)
actual_values = np.append(ts_2006.values, [0])
# initialise empty array for forecast
forecasts = np.zeros((n_forecasts+1, horizon+1))
confidence_intervals = np.zeros((n_forecasts+1, 2, horizon+1))
    # rolling training window for predictions (30 weeks here, despite the variable name)
    five_years = datetime.timedelta(weeks=30)
    # iterate over each week from 2006 onwards
for week_i, week in enumerate(ts_2006.index):
        # take the trailing window as the training set and align the exogenous LIBOR series to it
        ts_train = ts[week-five_years: week]
        libor_train = libor[week-five_years: week]
        model = ARIMA(ts_train, order=(lags, 1, 0), exog=libor_train)
        results_ARIMA = model.fit(disp=-1)
        # future LIBOR values are needed to forecast beyond the training window
        libor_future = libor[week:][1:horizon+1]
        if len(libor_future) < horizon:
            break # not enough future LIBOR observations left for a full-horizon forecast
        forecast = results_ARIMA.forecast(horizon, exog=libor_future) # forecast for the horizon
        forecast_vals = forecast[0]
        forecast_conf_intervals = forecast[2]
for f in range(1,horizon+1):
if week_i <= n_forecasts - (f):
forecasts[week_i+f, f] = forecast_vals[f-1]
confidence_intervals[week_i+f, :, f] = forecast_conf_intervals[f-1,:]
    # each forecast is stored against the week it predicts (one step ahead of when it was made),
    # so the first row of the forecast array stays zero - no prediction exists for the first week
newtime = np.datetime64(ts_2006.index[-1]+datetime.timedelta(weeks=horizon))
new_time_index = pd.DatetimeIndex(np.append(ts_2006.index, newtime))
forecast_df = pd.DataFrame({'actual': actual_values}, index=new_time_index)
for col in range(horizon):
forecast_df[col+1] = forecasts[:, col+1]
return forecast_df, confidence_intervals
#??? INDICES NOT ALIGNED
forecast_window = forecast_variable_horizon_df['2007':'2009']
forecast_window.head()
def plot_prediction_window(forecast_variable_horizon_df, start_year, end_year):
plt.figure(figsize=(20,10))
start_index = len(forecast_variable_horizon_df['actual']) - len(forecast_variable_horizon_df['actual'][start_year:])
end_index = start_index + len(forecast_variable_horizon_df['actual'][start_year:end_year])
# slice the predictions dataframe:
forecast_window = forecast_variable_horizon_df[start_year:end_year]
# set forecast horizon
fc = 2
# plot the actual values and the forecast
plt.plot(forecast_window['actual'][:-1], 'g', linewidth=2, label='actual values')
plt.plot(forecast_window.index, forecast_window[fc], 'r--', linewidth=2, label=str(fc) +' week forecast')
# plot the confidence bounds:
lower = confidence_intervals[:,:,fc][start_index:end_index, 0]
upper = confidence_intervals[:,:,fc][start_index:end_index, 1]
plt.plot(forecast_window.index, lower, 'c--', label='lower confidence bound')
plt.plot(forecast_window.index, upper, 'c--', label='upper confidence bound')
predicted = forecast_window[fc]
actual = forecast_window['actual']
# score the predictions
rmse = score(actual, predicted)
r2 = r2_score(actual, predicted)
    # set the plot title and legend
    plt.title('Forecast horizon ' + str(fc) + ' weeks: RMSE: %.4f '% rmse + ' R2: %.4f' % r2, fontsize=20)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.legend(fontsize=20, loc='best')
ts_train = ts['2002':'2007']
model = ARIMA(ts_train, order=(5, 1, 0))
results_ARIMA = model.fit(disp=-1)
results_ARIMA.summary()
forecast_window = forecast_variable_horizon_df['2006': '2007']
plt.hist(((forecast_window['actual']-forecast_window[1])), bins=100)
plt.xlim([-0.2,0.2])
forecast_window = forecast_variable_horizon_df['2008': '2009']
plt.hist(((forecast_window['actual']-forecast_window[1])), bins=50)
plt.xlim([-0.2,0.2])
# results_ARIMA.summary()
start_year = '2008'
end_year = '2010'
plot_prediction_window(forecast_variable_horizon_df, start_year, end_year)
start_year = '2016'
end_year = '2016'
plot_prediction_window(forecast_variable_horizon_df, start_year, end_year)
start_year = '2013-04'
end_year = '2015-02'
plot_prediction_window(forecast_variable_horizon_df, start_year, end_year)
Same concept as before, but the number of lags is varied: for each lag value the model is re-fit each week in the roll-forward scheme, the one-week-ahead predictions are collected, and a single R2 is computed over them. The lag value with the best (highest) R2 can then be selected.
def roll_forward_cross_validate(ts, num_lags):
scores = np.zeros((num_lags))
for lag in range(1, num_lags+1):
forecast_df, conf_int = forecast_variable_horizon(ts, 1, lag)
scores[lag-1] = r2_score(forecast_df['actual'][1:], forecast_df[1][1:])
print 'lag ', lag, 'score ', scores[lag-1]
return scores
scores = roll_forward_cross_validate(ts, 5)
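Assuming the scores returned above are R2 values (higher is better), a small sketch of picking the best lag:
# Sketch: lag with the highest roll-forward R2 (lags are 1-indexed)
best_lag = int(np.argmax(scores)) + 1
print 'best lag:', best_lag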
import math
def subset_ts(ts, n):
length = len(ts.values)
    subset_length = length // n # integer length of each subset, so it can be used for slicing
indices = ts.index.values
subsets = []
for i in range(n):
start = subset_length*i
end = subset_length*(i+1)
subset_indices = indices[start:end]
subset = ts[subset_indices]
subsets.append(subset)
return subsets
[ts_1, ts_2, ts_3, ts_4, ts_5] = subset_ts(ts, 5)
def cross_validate_lags(lags, ts, k):
subsets = subset_ts(ts, k)
scores = np.zeros((len(lags), 1))
for lag in lags:
lag_r2 = []
for i, subset in enumerate(subsets):
model = ARIMA(subset, order=(lag, 1, 0))
results_ARIMA = model.fit(disp=-1)
other_subsets_r2 = []
for j, other_subset in enumerate(subsets):
if i != j:
subset_indices = other_subset.index
predictions = results_ARIMA.predict(start=subset_indices[0], end=subset_indices[-1])
                    other_subsets_r2.append(r2_score(other_subset, predictions)) # each subset is a plain Series
            lag_r2.append(np.mean(other_subsets_r2))
            print 'lag ', lag, 'r2 ', np.mean(other_subsets_r2)
        scores[lags.index(lag)] = np.mean(lag_r2)
return scores
# scores = cross_validate_lags([1,2,3,4,5], ts, 5)
# start with an n x 20 matrix (i.e. for each time point, the columns are the lags at t=t-1, t=t-2 etc)
n_lags = 20
time_series_lags = np.zeros((len(ts), n_lags))
for n in range(1, n_lags+1):
shifted = ts.shift(n)
for index, val in enumerate(ts.values):
if index > n_lags-1:
time_series_lags[index, n-1] = shifted[index]
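As an aside, the same lag matrix can be built more directly with pandas shifts; a minimal sketch, not used below (the zero-padding of the earliest rows differs slightly from the loop above):
# Sketch: lag matrix via pandas, columns are the values at t-1 ... t-20
lag_df = pd.concat([ts.shift(k) for k in range(1, n_lags+1)], axis=1)
lag_df.columns = ['lag_%d' % k for k in range(1, n_lags+1)]
lag_df = lag_df.fillna(0)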
# start with an n x 20 matrix (i.e. for each time point, the columns are the lags at t=t-1, t=t-2 etc)
n_lags = 20
libor_series_lags = np.zeros((len(libor), n_lags))
for n in range(1, n_lags+1):
shifted = libor.shift(n)
for t, val in enumerate(libor.values):
if t > n_lags-1:
libor_series_lags[t, n-1] = shifted[t]
#np.savetxt("test.csv", time_series_lags, delimiter=",")
sentiment_scores = pd.read_csv('../data_collection/scores_df.csv', delimiter=',')
sentiment_scores = sentiment_scores.set_index(pd.DatetimeIndex(sentiment_scores['weeks']))
sentiment_scores.drop('weeks', axis=1, inplace=True)
print sentiment_scores.index.unique()
print ts.index.unique()
new_index = sentiment_scores.apply(lambda row: row.index + datetime.timedelta(days=4))['avg_weekly_pos_score']
sentiment_scores = sentiment_scores.set_index(new_index)
The exchange-rate series keeps one observation per week (the weekly trading day extracted above). If there was no trading on that day in a given week, that week is missing from the exchange-rate series, so it contains fewer weeks than the article data. We need to make sure the weeks match between the sentiment scores and the exchange-rate time series.
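A quick sanity check of the alignment (added here as a sketch): list the weeks present in one index but not the other.
# Sketch: weeks that appear in only one of the two series
print 'sentiment weeks missing from the exchange-rate series:', sentiment_scores.index.difference(ts.index)
print 'exchange-rate weeks missing from the sentiment series:', ts.index.difference(sentiment_scores.index)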
print sentiment_scores.index.unique()
print ts.index.unique()
ts_df = pd.DataFrame(data=ts)
ts_df.head()
time_series_filtered = ts_df.merge(sentiment_scores, right_index=True, left_index=True)
pos = time_series_filtered['avg_weekly_pos_score']
neg = time_series_filtered['avg_weekly_neg_score']
ts_filtered = time_series_filtered['UK_US']
lag_acf = acf(pos, nlags=40)
lag_pacf = pacf(pos, nlags=40, method='ols')
#Plot ACF:
plt.figure(figsize=(15,10))
plt.subplot(121)
plt.plot(lag_acf)
plt.axhline(y=0,linestyle='--',color='gray')
plt.axhline(y=-1.96/np.sqrt(len(pos)),linestyle='--',color='gray')
plt.axhline(y=1.96/np.sqrt(len(pos)),linestyle='--',color='gray')
plt.title('Autocorrelation Function')
#Plot PACF:
plt.subplot(122)
plt.plot(lag_pacf)
plt.axhline(y=0,linestyle='--',color='gray')
plt.axhline(y=-1.96/np.sqrt(len(pos)),linestyle='--',color='gray')
plt.axhline(y=1.96/np.sqrt(len(pos)),linestyle='--',color='gray')
plt.title('Partial Autocorrelation Function')
plt.tight_layout()
n_lags = 20
pos_sentiment_lags = np.zeros((len(pos.values), n_lags))
for n in range(1, n_lags+1):
shifted = pos.shift(n)
for t, val in enumerate(pos.values):
if t > n_lags-1:
pos_sentiment_lags[t, n-1] = shifted[t]
n_lags = 20
neg_sentiment_lags = np.zeros((len(neg.values), n_lags))
for n in range(1, n_lags+1):
shifted = neg.shift(n)
for t, val in enumerate(neg.values):
if t > n_lags-1:
neg_sentiment_lags[t, n-1] = shifted[t]
n_lags = 20
time_series_filtered_lags = np.zeros((len(ts_filtered.values), n_lags))
for n in range(1, n_lags+1):
shifted = ts_filtered.shift(n)
for index, val in enumerate(ts_filtered.values):
if index > n_lags-1:
time_series_filtered_lags[index, n-1] = shifted[index]
def forecast_variable_horizon_exog(ts, horizon, sentiments, lags=3):
# start predictions at 2006
ts_2006 = ts['2006':]
n_forecasts = len(ts_2006)
actual_values = np.append(ts_2006.values, [0])
# initialise empty array for forecast
forecasts = np.zeros((n_forecasts+1, horizon+1))
confidence_intervals = np.zeros((n_forecasts+1, 2, horizon+1))
    # rolling training window for predictions (30 weeks here, despite the variable name)
    five_years = datetime.timedelta(weeks=30)
    # iterate over each week from 2006 onwards
for week_i, week in enumerate(ts_2006.index):
# take the past 5 years as the training set
ts_train = ts[week-five_years: week]
        # subset the sentiment scores to the same training window
        sentiment_train = []
        for sentiment in sentiments:
            sentiment_train.append(sentiment[week-five_years: week])
        # stack as columns: transpose (rather than reshape) so each row holds one week's scores
        sentiment_train = np.array(sentiment_train).T
# train the model
model = ARIMA(ts_train, order=(lags, 1, 0), exog=sentiment_train)
results_ARIMA = model.fit(disp=-1)
        # forecast; future values of the exogenous variables are needed for the forecast horizon
        sentiment_future = np.array([s[week:][1:horizon+1].values for s in sentiments]).T
        if len(sentiment_future) < horizon:
            break # not enough future sentiment values left for a full-horizon forecast
        forecast = results_ARIMA.forecast(horizon, exog=sentiment_future) # forecast for the horizon
        forecast_vals = forecast[0]
        forecast_conf_intervals = forecast[2]
for f in range(1,horizon+1):
if week_i <= n_forecasts - (f):
forecasts[week_i+f, f] = forecast_vals[f-1]
confidence_intervals[week_i+f, :, f] = forecast_conf_intervals[f-1,:]
    # each forecast is stored against the week it predicts (one step ahead of when it was made),
    # so the first row of the forecast array stays zero - no prediction exists for the first week
newtime = np.datetime64(ts_2006.index[-1]+datetime.timedelta(weeks=horizon))
new_time_index = pd.DatetimeIndex(np.append(ts_2006.index, newtime))
forecast_df = pd.DataFrame({'actual': actual_values}, index=new_time_index)
for col in range(horizon):
forecast_df[col+1] = forecasts[:, col+1]
return forecast_df, confidence_intervals
forecast_variable_horizon_df, confidence_intervals = forecast_variable_horizon_exog(ts, 1, [pos, neg], lags=4)
def ridge_predict(predictor_arrays):
predictors = np.concatenate(predictor_arrays, axis=1)
predictor_matrix_df = pd.DataFrame(data=predictors, index=ts.index.values)
predictor_matrix_df_trimmed = predictor_matrix_df[20:-20]
predictor_matrix_df_subset_1 = predictor_matrix_df_trimmed['2000':'2006']
predictor_matrix_df_subset_2 = predictor_matrix_df_trimmed['2007':'2016']
# split into train and test sets
test_indices = predictor_matrix_df_subset_1.sample(frac=0.4).index
train_indices = predictor_matrix_df_subset_1.drop(test_indices).index
print 'Xtrain shape ', train_indices.shape
print 'Xtest shape ', test_indices.shape
# sort test index values to help with plotting
sorted_test_indices = test_indices.order()
X_train = predictor_matrix_df_trimmed.ix[train_indices]
X_test = predictor_matrix_df_trimmed.ix[sorted_test_indices]
y_train = ts[train_indices].values
y_test = ts[sorted_test_indices].values
    # fit a cross-validated Lasso model (despite the ridge_ naming, LassoCV is what is used here)
    ridge_cv = linear_model.LassoCV(normalize=True, cv=30, fit_intercept=True)
ridge_cv.fit(X_train, y_train)
# plot coefficients
plt.figure(figsize=(15,10))
plt.plot(ridge_cv.coef_)
# plt.title('Lasso regression coefficients')
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.xlabel('Lags', fontsize=20)
plt.ylabel('Coefficient', fontsize=20)
# validation set predictions
test_predictions = ridge_cv.predict(X_test)
# plot predictions
plt.figure(figsize=(15,10))
plt.scatter(sorted_test_indices, test_predictions, color='red', label='predictions')
plt.scatter(sorted_test_indices, y_test, label='actual')
plt.legend(loc='best', fontsize=20)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.xlabel('Time (weeks)', fontsize=20)
plt.ylabel('Exchange rate', fontsize=20)
print 'score ', str(r2_score(y_test, test_predictions))
# plt.title('Test predictions, $R^2$ = ' + str(r2_score(y_test, test_predictions)))
## residual histogram
plt.figure(figsize=(15,10))
plt.hist((test_predictions-y_test), bins=100)
# plt.title('Residuals')
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.xlabel('Residual Error', fontsize=20)
plt.ylabel('Frequency', fontsize=20)
return ridge_cv
ridge_model = ridge_predict([time_series_filtered_lags])
ridge_predict([time_series_filtered_lags, pos_sentiment_lags, neg_sentiment_lags])
def forecast(model, n, x_test, last_date):
predictions = []
# create future dates
week = datetime.timedelta(days=7)
    dates = np.array([(last_date + k*week) for k in range(1, n+1)]) # use k to avoid shadowing the n parameter
# set the x_test value
x_s = x_test
# for each forecast timestep
for t in range(0, n):
if t > 0:
# concatenate the prediction from the previous timestep to the current predictor set to get the lag values
# for the next timestep
x_s = np.concatenate((predictions, x_test), axis=0)
# take the first 20 values to predict the next timestep
x_s = x_s[0:20]
#predict based on the lags
p = np.array(model.predict(x_s))
# prepend to the forecasts array so the prediction will be in the correct order for the next prediction
if t > 0:
predictions = np.concatenate((p, predictions), axis=0)
else:
predictions = p
# reverse the predictions so they are in the correct order
return predictions[::-1], dates
def predict_n_weeks(n, ts_filtered):
    # the model is trained with 20 lags
num_lags = 20
colors = ['red', 'green', 'yellow', 'orange', 'magenta', 'blue']
# initialise
plt.figure(figsize=(15,15))
prediction_n_week = np.zeros((len(ts_filtered)-num_lags))
prediction_dates = []
# for each time step, predict n weeks ahead:
for i in range(num_lags, len(ts_filtered)):
last_date = ts_filtered.index[i]
# take last 20 values of df to generate forecasts for time i+1, i+2, i+3 etc
# (and reverse so it is in the order [t-1, t-2, t-3....]
        # take the 20 most recent values (ending at and including time i, newest first) as the lag features
        x_test = ts_filtered.iloc[i-num_lags+1:i+1].values[::-1]
# forecast
predictions_t, dates_t = forecast(ridge_model, n, x_test, last_date)
plt.plot(dates_t, predictions_t, color=np.random.choice(colors))
plt.plot()
# save last value
prediction_n_week[i-num_lags] = predictions_t[-1]
date = dates_t[-1]
# print 'last_date', last_date, 'prev_value',x_test[0], 'next_date', date, 'prediction', predictions_t[-1]
prediction_dates.append(date)
plt.plot(ts_filtered.index, ts_filtered.values, linestyle='dashed')
plt.xlabel('Time (weeks)', fontsize=20)
plt.ylabel('Exchange Rate', fontsize=20)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
return prediction_n_week, prediction_dates
sb.set_palette('muted')
ts_train = ts_filtered['2007':'2016']
ts_predict = ts_filtered['2007':'2016'].values[21:]
# 1 week look ahead forecast
prediction_1_week, prediction_dates_1 = predict_n_weeks(1, ts_train)
r2_1_week = r2_score(ts_predict, prediction_1_week[:-1])
print 'R2 1 week: ', r2_1_week
# # 5 week look ahead forecast
# prediction_5_week, prediction_dates_2 = predict_n_weeks(5, ts_train)
# # 10 week look ahead forecast
# prediction_10_week, prediction_dates_3 = predict_n_weeks(10, ts_train)
# 20 week look ahead forecast
prediction_20_week, prediction_dates_4 = predict_n_weeks(20, ts_train)
r2_20_week = r2_score(ts_predict, prediction_20_week[:-1])
print 'R2 20 week: ', r2_20_week
plt.figure(figsize=(15,15))
# actual values
plt.plot(ts_filtered.index, ts_filtered, linewidth=4, color='green')
# 1 week forecast
plt.plot(prediction_dates_1, prediction_1_week, color='red', linestyle='dashed', label='1 week forecast')
# plt.plot(prediction_dates_2, prediction_5_week, color='magenta', label='5 week forecast')
# plt.plot(prediction_dates_3, prediction_10_week, color='magenta', label='10 week forecast')
plt.plot(prediction_dates_4, prediction_20_week, color='orange', linestyle='dashed',label='20 week forecast')
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.ylabel('UK/US Exchange rate', fontsize=20)
plt.xlabel('Time (weeks)', fontsize=20)
plt.legend(fontsize=20)
x_train = ts_filtered
x_test = x_train.iloc[-20:].values[::-1]
last_date = x_train.index[-1]
predictions, dates = forecast(ridge_model, 40, x_test, last_date)
plt.figure(figsize=(15,15))
plt.scatter(dates, predictions, color='red')
plt.scatter(ts_filtered.index, ts_filtered.values, color='blue')